# read data
raw_data <- read_csv("data/heart_failure.csv")
## Rows: 299 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (13): TIME, Event, Gender, Smoking, Diabetes, BP, Anaemia, Age, Ejection...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# clean variable names
raw_data <- raw_data |>
  arrange(TIME) |>
  janitor::clean_names()

## data cleaning, create data frame for model
model_data <- raw_data |>
  mutate(gender = factor(gender),
         smoking = factor(smoking),
         diabetes = factor(diabetes),
         bp = factor(bp),
         # event = factor(event),
         anaemia = factor(anaemia),
         logcre=log(creatinine+1),
         logcpk = log(cpk+1)) |>
  rename(pletelets = pletelets)|>
  mutate(ef_cat = factor(case_when(
    ejection_fraction <= 30 ~ "Low",
    ejection_fraction > 30 & ejection_fraction < 45 ~ "Medium",
    ejection_fraction >= 45 ~ "High"),
    levels = c("Low", "Medium", "High")))
model_data
## # A tibble: 299 × 16
##     time event gender smoking diabe…¹ bp    anaemia   age eject…² sodium creat…³
##    <dbl> <dbl> <fct>  <fct>   <fct>   <fct> <fct>   <dbl>   <dbl>  <dbl>   <dbl>
##  1     4     1 1      0       0       1     0          75      20    130     1.9
##  2     6     1 1      0       0       0     0          55      38    136     1.1
##  3     7     1 1      1       0       0     0          65      20    129     1.3
##  4     7     1 1      0       0       0     1          50      20    137     1.9
##  5     8     1 0      0       1       0     1          65      20    116     2.7
##  6     8     1 1      1       0       1     1          90      40    132     2.1
##  7    10     1 1      0       0       0     1          75      15    137     1.2
##  8    10     1 1      1       1       0     1          60      60    131     1.1
##  9    10     1 0      0       0       0     0          65      65    138     1.5
## 10    10     1 1      1       0       1     1          80      35    133     9.4
## # … with 289 more rows, 5 more variables: pletelets <dbl>, cpk <dbl>,
## #   logcre <dbl>, logcpk <dbl>, ef_cat <fct>, and abbreviated variable names
## #   ¹​diabetes, ²​ejection_fraction, ³​creatinine
km.fit.gender=survfit(Surv(time, event)~ gender, data=model_data)

km.fit.bp=survfit(Surv(time, event)~ bp, data=model_data)

km.fit.smoke=survfit(Surv(time, event)~ smoking, model_data)

km.fit.diabetes=survfit(Surv(time, event)~ diabetes, model_data)

km.fit.anaemia=survfit(Surv(time, event)~ anaemia, model_data)

km.fit.ef_cat=survfit(Surv(time, event)~ ef_cat, model_data)

km.fit.age=survfit(Surv(time, event)~ age, model_data)

km.fit.pletelets=survfit(Surv(time, event)~ pletelets, model_data)

km.fit.logcre=survfit(Surv(time, event)~ logcre, model_data)

km.fit.logcpk=survfit(Surv(time, event)~ logcpk, model_data)
ggsurvplot(
  km.fit.gender,
  fun = "cumhaz",
  xlab = "Time",
  ylab = "-log(S(t))",
  title = "Negative Log of Estimated Survival Functions for Gender"
)

ggsurvplot(
  km.fit.gender,
  fun = "cloglog",
  xlab = "Log(time)",
  ylab = "log(-log(S(t)))",
  title = "Log of Negative Log of Estimated Survival Function for Gender"
)

ggsurvplot(
  km.fit.bp,
  fun = "cumhaz",
  xlab = "Time",
  ylab = "-log(S(t))",
  title = "Negative Log of Estimated Survival Functions for Blood Pressure"
)

ggsurvplot(
  km.fit.bp,
  fun = "cloglog",
  xlab = "Log(time)",
  ylab = "log(-log(S(t)))",
  title = "Log of Negative Log of Estimated Survival Function for Blood Pressure"
)

ggsurvplot(
  km.fit.smoke,
  fun = "cumhaz",
  xlab = "Time",
  ylab = "-log(S(t))",
  title = "Negative Log of Estimated Survival Functions for Diabetes"
)

ggsurvplot(
  km.fit.smoke,
  fun = "cloglog",
  xlab = "Log(time)",
  ylab = "log(-log(S(t)))",
  title = "Log of Negative Log of Estimated Survival Function for Diabetes"
)

ggsurvplot(
  km.fit.diabetes,
  fun = "cumhaz",
  xlab = "Time",
  ylab = "-log(S(t))",
  title = "Negative Log of Estimated Survival Functions for Diabetes"
)

ggsurvplot(
  km.fit.diabetes,
  fun = "cloglog",
  xlab = "Log(time)",
  ylab = "log(-log(S(t)))",
  title = "Log of Negative Log of Estimated Survival Function for Diabetes"
)

ggsurvplot(
  km.fit.anaemia,
  fun = "cumhaz",
  xlab = "Time",
  ylab = "-log(S(t))",
  title = "Negative Log of Estimated Survival Functions for Anaemia"
)

ggsurvplot(
  km.fit.anaemia,
  fun = "cloglog",
  xlab = "Log(time)",
  ylab = "log(-log(S(t)))",
  title = "Log of Negative Log of Estimated Survival Function for Anaemia"
)

ggsurvplot(
  km.fit.ef_cat,
  fun = "cumhaz",
  xlab = "Time",
  ylab = "-log(S(t))",
  title = "Negative Log of Estimated Survival Functions for Ejection Fraction"
)

ggsurvplot(
  km.fit.ef_cat,
  fun = "cloglog",
  xlab = "Log(time)",
  ylab = "log(-log(S(t)))",
  title = "Log of Negative Log of Estimated Survival Function for Ejection Fraction"
)

ggsurvplot(
  km.fit.age,
  fun = "cumhaz",
  xlab = "Time",
  ylab = "-log(S(t))",
  title = "Negative Log of Estimated Survival Functions for Ejection Fraction"
)

ggsurvplot(
  km.fit.age,
  fun = "cloglog",
  xlab = "Log(time)",
  ylab = "log(-log(S(t)))",
  title = "Log of Negative Log of Estimated Survival Function for Ejection Fraction"
)

ggsurvplot(
  km.fit.age,
  fun = "cumhaz",
  xlab = "Time",
  ylab = "-log(S(t))",
  title = "Negative Log of Estimated Survival Functions for Age"
)

ggsurvplot(
  km.fit.age,
  fun = "cloglog",
  xlab = "Log(time)",
  ylab = "log(-log(S(t)))",
  title = "Log of Negative Log of Estimated Survival Function for Age"
)

ggsurvplot(
  km.fit.pletelets,
  fun = "cumhaz",
  xlab = "Time",
  ylab = "-log(S(t))",
  title = "Negative Log of Estimated Survival Functions for Pletelets"
)

ggsurvplot(
  km.fit.pletelets,
  fun = "cloglog",
  xlab = "Log(time)",
  ylab = "log(-log(S(t)))",
  title = "Log of Negative Log of Estimated Survival Function for Pletelets"
)

ggsurvplot(
  km.fit.logcre,
  fun = "cumhaz",
  xlab = "Time",
  ylab = "-log(S(t))",
  title = "Negative Log of Estimated Survival Functions for log(cre)"
)

ggsurvplot(
  km.fit.logcre,
  fun = "cloglog",
  xlab = "Log(time)",
  ylab = "log(-log(S(t)))",
  title = "Log of Negative Log of Estimated Survival Function for log(cre)"
)

ggsurvplot(
  km.fit.logcpk,
  fun = "cumhaz",
  xlab = "Time",
  ylab = "-log(S(t))",
  title = "Negative Log of Estimated Survival Functions for log(cpk)"
)

ggsurvplot(
  km.fit.logcpk,
  fun = "cloglog",
  xlab = "Log(time)",
  ylab = "log(-log(S(t)))",
  title = "Log of Negative Log of Estimated Survival Function for log(cpk)"
)

library(patchwork)

The Cox Proportional Hazards (PH) Model operates under two primary assumptions: 1)Survival Cures for different strata must have proportional hazard function over time 2)The relationship between log hazard and each covariate is linear. By graphical approach, first assume that gender is the only covariate in the model, then take log of survival function and plot against time to estimate cumulative hazard function. From the figure(), The log survival function for the two genders that was estimated by the KM estimator are almost a straight line. From the figure(), the log log of survival function and plot against log time demonstrates a reasonable straight line with slope is 1. It can be concluded that the model with gender as the only covariates are likely to be exponential distributed. From the figure(), a plot that shows the differences between the observed KM estimates and fitted survival function for the PH model that has only gender.